VPNconnect: vpn.uoregon.edu, login ID

Mount to Talapas to access raw data (via Terminal Window)

#sshfs :/projects/niell/dniell /Users/deniseniell/talapas_dniell #cd /Users/deniseniell/talapas_dniell/cellranger

Open Rmd file (the following commands will be in RStudio)

Set your working directory path

#setwd("/Users/deniseniell/Desktop/Seurat/run2")
rm(list = ls())

Load libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Note: you might get an error when loading dplyr but if you just run the command again, then it will load and everything should work well
library(Seurat)
library(Matrix)
library(ggplot2)
library(sctransform)
library(stringr)

Load the Octo Seq raw data ~2m

Keep track of which data set you are working with. The code is written so that you can run through the pipeline without having to worry about changing variables (i.e. data sets are imported to the variable “all” so that all the following commands will process the data without you needing to change the variable each time)

all <- Read10X(data.dir = "D:/data/octo seq/Cellranger/OctoSeq2.1/raw_feature_bc_matrix")

replace gene names in raw data with IDs from ccsv file

takes a few minutes

ref <- read.csv("D:/data/octo seq/refMaster_040420.csv",stringsAsFactors=FALSE)

ngenes <- length(all@Dimnames[[1]])
for (g in 1:ngenes){
  gene<-all@Dimnames[[1]][g]
  gene<-substr(gene,6,str_length(gene)-2)
  ind<-grep(gene,ref[[1]])
  if (length(ind)>0) {
    id <- ref[[ind[1],2]]
    if (str_length(id)>0) {
      id <- str_remove_all(id,"\\(") # parentheses mess up gene names as dimensions
      id <- str_remove_all(id,"\\)")
      id <- substr(id,1,60) # keep it short
      all@Dimnames[[1]][g]<- paste(id,gene,sep='-')
    }
  }
}

option to read in RDS file here

#all <- readRDS(file = "/Users/deniseniell/Desktop/Seurat/run2/OSmarkersTree.rds")

Initialize the Seurat object with the raw (non-normalized data).

#Change the project name here so that you can keep track of your data

all <- CreateSeuratObject(counts = all, project = "OctoSeq2_names", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
## Warning: Feature names cannot have pipe characters ('|'), replacing with dashes
## ('-')

Begin Quality Control Steps

mito.genes <- grep(pattern = "^mt-", x = rownames(x = all), value = TRUE)
percent.mito <- Matrix::colSums(all) / Matrix::colSums(all)
all[["percent.mt"]] <- PercentageFeatureSet(all, pattern = "^MT-")
all <- subset(all, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5 & nCount_RNA>200)
plot1 <- FeatureScatter(all, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(all, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))

Number of unique genes and total moleculares are automatically calculated during CreateSeuratObject, but you can view these metrics because they are stored in the object meta data

head(all@meta.data, 5)
##                      orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACCCAAGAGTACCG OctoSeq2_names        258          206          0
## AAACCCAAGCATTGTC OctoSeq2_names       3058         1639          0
## AAACCCAAGCTCCCTT OctoSeq2_names        628          521          0
## AAACCCAAGGTATCTC OctoSeq2_names       1462          997          0
## AAACCCAAGTAATTGG OctoSeq2_names        437          346          0
median(all@meta.data$nCount_RNA)
## [1] 1014.5
median(all@meta.data$nFeature_RNA)
## [1] 725

You can also visualize these QC metrics and use this to decide how to filter cells

VlnPlot(all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol=3)

# Begin normalizing the data

all <- NormalizeData(all, normalization.method = "LogNormalize", scale.factor = 10000)

You can also use the following command and you should receive the same output

#all <- NormalizeData(all)

Identification of highly variable features (feature selection), and plot these features

all <- FindVariableFeatures(all, selection.method = "vst", nfeatures = 2000)

Identify the 10 most highly variable genes

top10 <- head(VariableFeatures(all),10)

Plot variable features with and without labels

plot3 <- VariableFeaturePlot(all)
plot4 <- LabelPoints(plot = plot3, points = top10, repel = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
## When using repel, set xnudge and ynudge to 0 for optimal results
#CombinePlots(plots = list(plot3, plot4))

Apply a linear transformation (scaling), which is a standard pre-processing step prior to dimensional reduction techniques like PCA ~2m

all.genes <- rownames(all)
all <- ScaleData(all, features = all.genes)
## Centering and scaling data matrix

Perform linear dimensional reduction (PCA) ~15-20m

I normally will create a new variable/Seurat object at this point and rename it as “all.norm” so that I know that I have all of the preprocesed data prior to running PCA, and I can manipulate the PCs etc moving forward but still be able to go back to the processed data easily

all.norm <- RunPCA(all, features = VariableFeatures(object = all), npcs = 100)
## PC_ 1 
## Positive:  No hits-Ocbimv22033271m, Scavenger receptor cysteine rich domain lysyl oxidase like 3-Ocbimv22009850m, No hits-Ocbimv22032145m, Matrixin metallopeptidase-Ocbimv22031750m, No hits-Ocbimv22027477m, gene:Ocbimv22016173m.g, von Willebrand factor-Ocbimv22011288m, AChE-Ocbimv22038398m, No hits squid hit-Ocbimv22006272m, No hits squid hit-Ocbimv22021234m 
##     Cytochrome C somatic-Ocbimv22015949m, methylmalonic-aciduria-cobalamin-deficiency-cblC-type-with-h-Ocbimv22037026m, No hits squid hit-Ocbimv22039757m, gene:Ocbimv22021233m.g, No hits-Ocbimv22032142m, Thyroglobulin-Ocbimv22039703m, No hits-Ocbimv22012929m, No hits-Ocbimv22016862m, SOUL-heme-binding-protein-Ocbimv22031901m, No hits squid hit-Ocbimv22028019m 
##     PLAC8-family-Ocbimv22016971m, Astacin ShK domain like tolloid like-Ocbimv22025489m, Multicopper-oxidase-hephaestin-Ocbimv22036318m, kyphoscoliosis-peptidase-Ocbimv22013362m, gene:Ocbimv22031586m.g, Innexin-Ocbimv22027764m, No hits-Ocbimv22018192m, gene:Ocbimv22032148m.g, gene:Ocbimv22022617m.g, Innexin-Ocbimv22027762m 
## Negative:  Neuroendocrine hormone precursor-Ocbimv22007151m, gene:Ocbimv22000523m.g, gene:Ocbimv22009865m.g, Kazal-type-serine-protease-inhibitor-domain---Immunoglobulin-Ocbimv22029399m, NAD/NADP-octopine/nopaline-dehydrogenase-alpha-helical-domai-Ocbimv22006362m, EF-hand--EF-hand------EF-hand---calmodulin-1-phosphorylase-k-Ocbimv22032213m, Caprin family member-Ocbimv22030377m, gene:Ocbimv22001092m.g, gene:Ocbimv22022323m.g, No hits-Ocbimv22024117m 
##     No hit squid hit-Ocbimv22001854m, gene:Ocbimv22019384m.g, No hits squid and invert hits-Ocbimv22028115m, Cadherin-Ocbimv22025965m, No hit-Ocbimv22011989m, Subtilase-family-Proprotein-convertase-P-domain-proprotein-c-Ocbimv22033366m, Lamin intermediate filament protein-Ocbimv22000068m, Unknown function-Ocbimv22030969m, No hits-Ocbimv22005243m, LICD protein family-Ocbimv22030344m 
##     No hits squid and invert hits-Ocbimv22014073m, No hit squid and invert hits-Ocbimv22014608m, Stathmin-family-Stathmin-family-Stathmin-family-stathmin-1-Ocbimv22036611m, Collagen-triple-helix-repeat-20-copies-Collagen-triple-helix-Ocbimv22039751m, Copper type 2 dependent ascorbate dependent monoxygenase-Ocbimv22015079m, gene:Ocbimv22009322m.g, gene:Ocbimv22000524m.g, No hits at all-Ocbimv22024372m, CUB-domain-CUB-domain-containing-protein-2-Ocbimv22012935m, Deoxyribonuclease-Ocbimv22015857m 
## PC_ 2 
## Positive:  Thymosin-beta-4-family-Thymosin-beta-4-family-Thymosin-beta--Ocbimv22038101m, gene:Ocbimv22035787m.g, Innexin-Ocbimv22016460m, gene:Ocbimv22030005m.g, Cadherin-domain-Cadherin-domain-Cadherin-domain-Cadherin-dom-Ocbimv22039316m, No hit Squid hit-Ocbimv22033868m, Aquaporin-Ocbimv22038019m, Cadherin-like-Cadherin-domain-Cadherin-domain-Cadherin-domai-Ocbimv22020837m, Innexin-Ocbimv22016464m, Low density lipoprotein receptor-Ocbimv22000216m 
##     No hits squid hit-Ocbimv22006271m, Collagen triple helix repeat-Ocbimv22035245m, Rhodanese-like-domain-Dual-specificity-phosphatase-catalytic-Ocbimv22013786m, Ankyrin repeat/Notch?-Ocbimv22007283m, Insulin-like growth factor binding protein-Ocbimv22024561m, No hits squid and invert hits-Ocbimv22037269m, slit-homolog-2-Drosophila-Ocbimv22018812m, Sema-domain-Plexin-repeat-IPT/TIG-domain-IPT/TIG-domain-IPT/-Ocbimv22006562m, Cysteine Sulfinic Acid Decarboxylase-Ocbimv22004812m, Zinc-finger-C2H2-type--zinc-finger-protein-285B-pseudogene-Ocbimv22037537m 
##     gene:Ocbimv22032467m.g, Stanniocalcin-family-Ocbimv22031945m, Lipoprotein-amino-terminal-region-Domain-of-unknown-function-Ocbimv22023313m, AchR-Ocbimv22006182m, Tetraspanin family molecule-Ocbimv22038024m, EGFNotch?-Ocbimv22021827m, No hits-Ocbimv22024563m, Vault-protein-inter-alpha-trypsin--von-Willebrand-factor-typ-Ocbimv22014896m, bZIP-transcription-factor-Basic-region-leucine-zipper-bZIP-M-Ocbimv22024895m, GPCR-Ocbimv22039826m 
## Negative:  Neuroendocrine hormone precursor-Ocbimv22007151m, gene:Ocbimv22001092m.g, LICD protein family-Ocbimv22030344m, No hit squid and invert hits-Ocbimv22014608m, gene:Ocbimv22031183m.g, Kazal-type-serine-protease-inhibitor-domain---Immunoglobulin-Ocbimv22029399m, TB2/DP1-HVA22-family-receptor-accessory-protein-5-Ocbimv22006009m, gene:Ocbimv22000523m.g, EF-hand--EF-hand------EF-hand---calmodulin-1-phosphorylase-k-Ocbimv22032213m, Triosephosphate-isomerase-triosephosphate-isomerase-1-Ocbimv22037419m 
##     Universal-stress-protein-family-Ocbimv22030262m, Subtilase-family-Proprotein-convertase-P-domain-proprotein-c-Ocbimv22033366m, Carbohydrate-phosphorylase-phosphorylase-glycogen;-brain-Ocbimv22020127m, Enolase-N-terminal-domain-Enolase-C-terminal-TIM-barrel-doma-Ocbimv22035502m, Copper type 2 dependent ascorbate dependent monoxygenase-Ocbimv22015079m, gene:Ocbimv22000524m.g, gene:Ocbimv22001641m.g, gene:Ocbimv22022323m.g, Fibrinogen like-Ocbimv22002134m, VMAT A-Ocbimv22031489m 
##     Stathmin-family-Stathmin-family-Stathmin-family-stathmin-1-Ocbimv22036611m, Ndr-family--N-myc-downstream-regulated-1-Ocbimv22005038m, Zinc-finger-C3HC4-type-RING-finger--Ocbimv22009276m, NAD/NADP-octopine/nopaline-dehydrogenase-alpha-helical-domai-Ocbimv22006362m, gene:Ocbimv22024976m.g, gene:Ocbimv22031184m.g, Helix-loop-helix-DNA-binding-domain-Ocbimv22004730m, Sulfotransferase Tango13 Transport-Ocbimv22015376m, Cadherin-Ocbimv22025965m, CUB-domain-CUB-domain-containing-protein-2-Ocbimv22012935m 
## PC_ 3 
## Positive:  Insulin-like growth factor binding protein-Ocbimv22024561m, Low density lipoprotein receptor-Ocbimv22000216m, Collagen triple helix repeat-Ocbimv22035245m, No hits squid hit-Ocbimv22006271m, Collagen-triple-helix-repeat-20-copies-Collagen-triple-helix-Ocbimv22031313m, Aquaporin-Ocbimv22038019m, Ankyrin repeat/Notch?-Ocbimv22007283m, No hits-Ocbimv22024563m, No hits squid and invert hits-Ocbimv22037269m, gene:Ocbimv22030005m.g 
##     Collagen-triple-helix-repeat-20-copies-Collagen-triple-helix-Ocbimv22023946m, Eukaryotic type carbonic anhydrase-Ocbimv22023622m, AchR-Ocbimv22006182m, EGFNotch?-Ocbimv22021827m, Collagen triple helix repeat-Ocbimv22027357m, Tetraspanin family molecule-Ocbimv22038024m, collagen-type-IV-alpha-3-Goodpasture-antigen-Ocbimv22032747m, Lipoprotein-amino-terminal-region-Domain-of-unknown-function-Ocbimv22023313m, Vault-protein-inter-alpha-trypsin--von-Willebrand-factor-typ-Ocbimv22016107m, Vault-protein-inter-alpha-trypsin--von-Willebrand-factor-typ-Ocbimv22014896m 
##     Fibrinogen-beta-and-gamma-chains-C-terminal-globular-domain--Ocbimv22024607m, No hit Squid hit-Ocbimv22033868m, No hits squid hit-Ocbimv22006273m, Innexin-Ocbimv22016460m, 7TMR latrophil?-Ocbimv22005132m, Beta lactamase-Ocbimv22011235m, C2H2-Ocbimv22012776m, Fibronectin-Ocbimv22011001m, Cysteine-rich-secretory-protein-family-GLI-pathogenesis-rela-Ocbimv22013028m, No hits squid hit-Ocbimv22000497m 
## Negative:  Scavenger receptor cysteine rich domain lysyl oxidase like 3-Ocbimv22009850m, No hits-Ocbimv22032145m, von Willebrand factor-Ocbimv22011288m, No hits squid hit-Ocbimv22006272m, methylmalonic-aciduria-cobalamin-deficiency-cblC-type-with-h-Ocbimv22037026m, No hits squid hit-Ocbimv22018450m, gene:Ocbimv22032148m.g, von Willebrand factor-Ocbimv22012241m, No hits squid hit-Ocbimv22021232m, No hits squid hit-Ocbimv22006708m 
##     Multicopper-oxidase-hephaestin-Ocbimv22036318m, No hits-Ocbimv22032142m, No hits squid hit-Ocbimv22018451m, Polysaccharide deacetylase-Ocbimv22030571m, No hits squid hit-Ocbimv22021234m, Multicopper-oxidase-hephaestin-like-1-Ocbimv22013232m, gene:Ocbimv22031586m.g, No hits-Ocbimv22037232m, Multicopper oxidase hepaestin-Ocbimv22036319m, Beta lactamase-Ocbimv22037424m 
##     GPCR Rhodopsin family but not one of our opsins-Ocbimv22017246m, No hits-Ocbimv22033271m, Matrixin metallopeptidase-Ocbimv22031751m, No hits-Ocbimv22027477m, No hits-Ocbimv22002030m, SOUL-heme-binding-protein-Ocbimv22031901m, Calmodulin-Ocbimv22019794m, gene:Ocbimv22010514m.g, GPCR Rhodopsin family but not one of our opsins-Ocbimv22001500m, No hits-Ocbimv22016862m 
## PC_ 4 
## Positive:  von Willebrand factor-Ocbimv22012241m, No hits squid hit-Ocbimv22018450m, No hits squid hit-Ocbimv22018451m, Stanniocalcin-family-Ocbimv22031945m, GPCR Rhodopsin family but not one of our opsins-Ocbimv22001500m, GPCR Rhodopsin family but not one of our opsins-Ocbimv22017246m, No hits squid hit-Ocbimv22021232m, Innexin-Ocbimv22027764m, No hits-Ocbimv22032137m, Astacin ShK domain like tolloid like-Ocbimv22016617m 
##     Polysaccharide deacetylase-Ocbimv22030571m, Thyroglobulin-Ocbimv22039703m, Beta lactamase-Ocbimv22037424m, Thyroglobulin-Ocbimv22033538m, Putative peptidoglycan Mmp1-Ocbimv22002743m, Thyroglobulin-type-1-repeat-Ocbimv22033536m, Perilipin-Ocbimv22023930m, EB-module-EB-module-Protein-tyrosine-phosphatase-Dual-specif-Ocbimv22033209m, methylmalonic-aciduria-cobalamin-deficiency-cblC-type-with-h-Ocbimv22037026m, Galactoside-binding-lectin-Galactoside-binding-lectin-Galact-Ocbimv22037980m 
##     von Willebrand factor-Ocbimv22011288m, Carbon-nitrogen-hydrolase-vanin-3-Ocbimv22026433m, No hits squid hit-Ocbimv22006271m, Cytochrome C somatic-Ocbimv22015949m, Immunoglobulin-I-set-domain--kin-of-IRRE-like-Drosophila-Ocbimv22005431m, Astacin-Peptidase-family-M12A-CUB-domain-CUB-domain-Calcium--Ocbimv22015290m, gene:Ocbimv22032148m.g, No hits-Ocbimv22002030m, Matrixin metallopeptidase-Ocbimv22031751m, Multicopper-oxidase-Multicopper-oxidase-Multicopper-oxidase-Ocbimv22012783m 
## Negative:  von-Willebrand-factor-type-A-domain--Ocbimv22013060m, linker-histone-H1-and-H5-family-H1-histone-family-member-0-Ocbimv22034699m, linker-histone-H1-and-H5-family-H1-histone-family-member-0-Ocbimv22034700m, RhoGAP-domain-Ocbimv22012861m, nucleolar-and-spindle-associated-protein-1-Ocbimv22039446m, Cell-cycle-regulated-microtubule-associated-protein-TPX2-mic-Ocbimv22033524m, gene:Ocbimv22032139m.g, von-Willebrand-factor-type-A-domain-Ocbimv22013063m, Cathepsin-Ocbimv22009822m, Core-histone-H2A/H2B/H3/H4-Histone-like-transcription-factor-Ocbimv22004229m 
##     gene:Ocbimv22000757m.g, Protein-kinase-domain-Protein-tyrosine-kinase--POLO-box-dupl-Ocbimv22028963m, Core-histone-H2A/H2B/H3/H4-histone-cluster-2-H3d-Ocbimv22001065m, Protein-kinase-domain-Protein-tyrosine-kinase--aurora-kinase-Ocbimv22010207m, linker-histone-H1-and-H5-family-H1-histone-family-member-0-Ocbimv22026507m, LIM-domain-xin-actin-binding-repeat-containing-2-Ocbimv22009265m, Timeless-protein-Timeless-protein-C-terminal-region-timeless-Ocbimv22004280m, gene:Ocbimv22011688m.g, Rad51--recA-bacterial-DNA-recombination-protein-KaiC-RAD51-h-Ocbimv22024630m, Kinesin-motor-domain-Tesmin/TSO1-like-CXC-domain-kinesin-fam-Ocbimv22002108m 
##     Core-histone-H2A/H2B/H3/H4-Histone-like-transcription-factor-Ocbimv22004225m, gene:Ocbimv22009747m.g, Kinesin-motor-domain-kinesin-family-member-20A-Ocbimv22017211m, Kinesin-motor-domain-centromere-protein-E-312kDa-Ocbimv22025596m, Importin-beta-binding-domain-Armadillo/beta-catenin-like-rep-Ocbimv22005104m, Nbl1-/-Borealin-N-terminal-Cell-division-cycle-associated-pr-Ocbimv22026418m, gene:Ocbimv22005812m.g, Protein-kinase-domain-Protein-tyrosine-kinase-Ocbimv22003693m, Multicopper-oxidase-Multicopper-oxidase-Multicopper-oxidase-Ocbimv22004288m, Barren-protein-non-SMC-condensin-I-complex-subunit-H-Ocbimv22001662m 
## PC_ 5 
## Positive:  Collagen-triple-helix-repeat-20-copies-Collagen-triple-helix-Ocbimv22031313m, gene:Ocbimv22035787m.g, Collagen triple helix repeat-Ocbimv22035245m, Eukaryotic type carbonic anhydrase-Ocbimv22023622m, Low density lipoprotein receptor-Ocbimv22000216m, Collagen-triple-helix-repeat-20-copies-Collagen-triple-helix-Ocbimv22023946m, Ribosomal-protein-S5-N-terminal-domain-Ribosomal-protein-S5--Ocbimv22018401m, Fibronectin-Ocbimv22011001m, slit-homolog-2-Drosophila-Ocbimv22018812m, collagen-type-IV-alpha-3-Goodpasture-antigen-Ocbimv22032747m 
##     Collagen triple helix repeat-Ocbimv22027357m, Nucleoside-diphosphate-kinase-NME1-NME2-readthrough-Ocbimv22012462m, No hits squid hit-Ocbimv22020075m, Core-histone-H2A/H2B/H3/H4-H2A-histone-family-member-V-Ocbimv22011053m, No hits squid hit-Ocbimv22006273m, No hits-Ocbimv22006768m, Platelet derived growth factor-Ocbimv22015194m, Eukaryotic-type-carbonic-anhydrase-Ocbimv22023623m, No hits squid hit-Ocbimv22031576m, Cysteine-rich-secretory-protein-family-GLI-pathogenesis-rela-Ocbimv22013028m 
##     Profilin-Ocbimv22036924m, Beta Tubulin-Ocbimv22029847m, Ribosomal-protein-L14-Ocbimv22009408m, Cadherin-domain-Cadherin-domain-Cadherin-domain-Cadherin-dom-Ocbimv22039316m, Zinc-finger-C2H2-type--CCCTC-binding-factor-zinc-finger-prot-Ocbimv22008402m, Sema-domain-Plexin-repeat-IPT/TIG-domain-IPT/TIG-domain-IPT/-Ocbimv22006562m, 60s-Acidic-ribosomal-protein-ribosomal-protein-large-P1-Ocbimv22024382m, 60s-Acidic-ribosomal-protein-ribosomal-protein-large-P2-Ocbimv22010428m, Ribosomal-protein-L11-N-terminal-domain-Ribosomal-protein-L1-Ocbimv22027712m, Aminopeptidase-Ocbimv22002564m 
## Negative:  No hit Squid hit-Ocbimv22033868m, Lipoprotein-amino-terminal-region-Domain-of-unknown-function-Ocbimv22023313m, No hit one weak to Voltage dependent calcium channel-Ocbimv22022356m, No hit-Ocbimv22026896m, Citron Rho interacting serine/threonine kinase-Ocbimv22027688m, Stanniocalcin-family-Ocbimv22031945m, EF-hand----calmodulin-3-phosphorylase-kinase-delta-Ocbimv22007343m, Intermediate-filament-protein-Fez1-Intermediate-filament-pro-Ocbimv22025461m, FRAS1-related-extracellular-matrix-protein-2-Ocbimv22026849m, AMPA-like-Ocbimv22039454m 
##     Beta lactamase-Ocbimv22011235m, Lamin intermediate filament protein-Ocbimv22000068m, Lipoprotein-amino-terminal-region-Domain-of-unknown-function-Ocbimv22014214m, Innexin-Ocbimv22016460m, Glycosyl-hydrolases-family-38-C-terminal-domain-mannosidase--Ocbimv22005050m, EGF-like-domain---Immunoglobulin-I-set-domain--Immunoglobuli-Ocbimv22036785m, Kazal-type-serine-protease-inhibitor-domain-Kazal-type-serin-Ocbimv22017255m, Caprin family member-Ocbimv22030377m, Potassium channel TWiK family-Ocbimv22011680m, No hits squid hit-Ocbimv22037830m 
##     No hit Squid hit-Ocbimv22014258m, Vault-protein-inter-alpha-trypsin--von-Willebrand-factor-typ-Ocbimv22014896m, Matrix metallopeptidase-Ocbimv22023633m, No hit Squid hit-Ocbimv22036586m, DAT-Ocbimv22026818m, Olfactomedin like-Ocbimv22029262m, Glycosyl-hydrolases-family-31-KIAA1161-Ocbimv22016666m, No hits squid and invert hits-Ocbimv22014073m, Leucine-Rich-Repeat-----Leucine-Rich-Repeat---Leucine-Rich-R-Ocbimv22026653m, gene:Ocbimv22006025m.g

save data at this point just in case RStudio crashes!

#saveRDS(all.norm, file = #"/Users/deniseniell/Desktop/Seurat/run2/OSr2norm200.rds")

Visualize PCAs in a few different ways

You can skip the next few lines of code (visualization of PCs, elbow and jackstraw) if you know how many pcs you want to move forward with.

DimHeatmap(all.norm, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(all.norm, dims = 1:10, cells = 500, balanced = TRUE)

VizDimLoadings(all.norm, dims = 1:2, reduction = "pca")

DimPlot(all.norm, reduction = "pca")

# Seurat clusters cells based on their PCA scores, with each PC essentially representing a “metafeature” that combines information across a correlated feature set. Top principle components represent robust compression of the dataset. To determine how many components to include, one can utilize a resampling test inspired by the JackStraw procedure.

Use the ElbowPlot function before JackStraw to visualize and save on computing power since the JackStraw method can take a long time for processing big datasets, others rely on the ElbowPlot function to reduce computation time and still produce an approximation.

ElbowPlot(all.norm, ndims = 200)
## Warning in ElbowPlot(all.norm, ndims = 200): The object only has information for
## 100 reductions

This sampling strategy randomly permutes a subset of the data (1% is the default) ad erun PCA, constructing a “null distribution” of feature scores, and repeat this procedure. This allows an identification of “significant” PCs as those who have a strong enrichment of low p-value features.

The JackStrawPlot function provides a visualization tool for comparing the distribution of p-values for each PC with a uniform distribution (indicated by the dashed line). “Significant” PCs will show a strong enrichment of features with low p-values (indicated by the solid curve above the dashed line). ~30m

##Note: even if you run 200pcs, the max # of dims you can use for ScoreJackStraw and JackStrawPlot is 20.

#all.norm <- JackStraw(all.norm, num.replicate = 100)
#all.norm <- ScoreJackStraw(all.norm, dims = 1:20)
#JackStrawPlot(all.norm, dims = 1:20)

Cluster the cells

Seurat v3 applies a graph-based clustering approach. Briefly, these methods embed cells in a graph structure - for example a K-nearest neighbor (KNN) graph, with edges drawn between cells with similar feature expression patterns, and then attempt to partition this graph into highly interconnected “quasi-cliques” or “communities”

(like PhenoGraph), a KNN graph is constructed based on the elucidean distance in PCA space. Then, the FindNeighbors function refines edge weights between any two cells based on the shared overlap in their local neighborhoods (this is referred to as Jaccard similarity). This function takes the previously defined input regarding number of PCs (now, dims)

To cluster cells, Seurat applies a modularity optimization technique, such as Louvain algorithm (which is the default) or SLM, to iteratively group cells together, with the goal of optimizing the standard modularity function.

FindClusters implements the procedure mentioned above and contains a resolution parameter that sets “granularity” of downstream clustering, with increased values leading to a greater number of clusters. Typically, setting the parameter between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Clusters could be found using the Idents function.

##Note: even if you only run 20 dims with the JackStraw function above, looks like you can still proceed with >20 dims in the following commands ~5m

all.norm <- FindNeighbors(all.norm, dims = 1:50) # changed this from 200, to be consistent with FindClusters next
## Computing nearest neighbor graph
## Computing SNN
all.norm <- FindClusters(all.norm, reduction.type = "pca", dims = 1:50, resolution = 1)
## Warning: The following arguments are not used: reduction.type, dims
## Suggested parameter: reduction instead of reduction.type
## Warning: The following arguments are not used: reduction.type, dims
## Suggested parameter: reduction instead of reduction.type
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 22066
## Number of edges: 1075527
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8868
## Number of communities: 32
## Elapsed time: 4 seconds
##Notes: 200dims, res 1 yields 18 singletons and 15 final clusters; 200dims, res 0.5 yields 18 singletons and 12 final clusters; 200dims, res 1.5 yields 29 singletons and 68 final clusters; 200dims, res 0.2 yields 18 singletons and 9 final clusters; 150dims, res 1 yields 18 singletons and 15 final clusters; 100dims, res 1 yields 18 singletons and 15 final clusters; 50dims, res 1 yields 18 singletons and 15 final clusters; 50dims, res 0.5 yields 18 singletons and 12 clusters. --> let's stick with 50dims at res 1 for now and revisit FindNeighbors parameters to address singleton issue
##Notes: Neighbors = 200 dims, Clusters = 50dims, at res 0.05 there are 18 singletons and 9 final clusters; Neighbors = 200, Clusters = 50dims, at res 0.01 there are 18 singletons and 6 final clusters

## Additional notes: for OctoSeq2, 50dims res 1 yields 1 singleton and 21 final clusters.

Look at cluster IDS of the first 5 cells

head(Idents(all.norm), 5)
## AAACCCAAGAGTACCG AAACCCAAGCATTGTC AAACCCAAGCTCCCTT AAACCCAAGGTATCTC 
##               19                2                8                2 
## AAACCCAAGTAATTGG 
##                0 
## 32 Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31

Run non-linear dimensional reduction (UMAP/tSNE)

Non-linear dimensional reduction techniques, such as tSNE and UMAP, allows one to visualize and explore the datasets. The goal of these algorithms is to learn the underlying manifold of the data in order to place similar cells together in a low-dimensional space. Cells within the graph-based clusters determined above should co-localize on these dimension reduction plots. Seurat suggests using the same PCs as what was provided as input to the clustering analysis.

all.norm <- RunUMAP(all.norm, dims = 1:50) # was 1:20 dims
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:51:34 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:51:34 Read 22066 rows and found 50 numeric columns
## 11:51:34 Using Annoy for neighbor search, n_neighbors = 30
## 11:51:34 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:51:39 Writing NN index file to temp file C:\Users\CRISNI~1\AppData\Local\Temp\RtmpkdVis5\file2c405ff7778d
## 11:51:39 Searching Annoy index using 1 thread, search_k = 3000
## 11:51:48 Annoy recall = 100%
## 11:51:49 Commencing smooth kNN distance calibration using 1 thread
## 11:51:51 Initializing from normalized Laplacian + noise
## 11:51:52 Commencing optimization for 200 epochs, with 1015742 positive edges
## 11:52:17 Optimization finished
DimPlot(all.norm, reduction = "umap", label = TRUE)

# Run non-linear dimensional reduction with tSNE (UMAP preferred, so this section is edited/commented out) #OS1.15 <-RunTSNE(object = OS1.norm, dims = 1:15, do.fast = TRUE) TSNEPlot(object = OS1.15, do.label = TRUE)

#saveRDS(all.norm, file = #"/Users/deniseniell/Desktop/Seurat/run2/OSr2PCs200res50.rds")

Build a phylogenetic tree, and rename/reorder cluster names according to their position on the tree

See help for details on tree building strategy

Assigned cluster will be placed in the ‘tree.ident’ field of , and also stored in

all.normTREE <- BuildClusterTree(all.norm, reorder = TRUE, reorder.numeric = TRUE, slot = "scale.data", verbose = TRUE, dims=1:50)
## Reordering identity classes and rebuilding tree
PlotClusterTree(all.normTREE)

Run non-linear dimensional reduction (UMAP/tSNE)

Non-linear dimensional reduction techniques, such as tSNE and UMAP, allows one to visualize and explore the datasets. The goal of these algorithms is to learn the underlying manifold of the data in order to place similar cells together in a low-dimensional space. Cells within the graph-based clusters determined above should co-localize on these dimension reduction plots. Seurat suggests using the same PCs as what was provided as input to the clustering analysis.

# don't need to recalculate, just replot
#all.normTREE <- RunUMAP(all.normTREE, dims = 1:50)
DimPlot(all.normTREE, reduction = "umap", label = TRUE)

Finding differentially expressed features (for 6 clusters, ~15m)

#FindMarkers will find markers between two different identity groups - you have to specify both identity groups. This is useful for comparing the differences between two specific groups.

#FindAllMarkers will find markers differentially expressed in each identity group by comparing it to all of the others - you don’t have to manually define anything. Note that markers may bleed over between closely-related groups - they are not forced to be specific to only one group. This is what most people use (and likely what you want).

cluster.markers <- FindAllMarkers(all.normTREE, min.pct = 0.5, logfc.threshold = 0.5)
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
## Calculating cluster 25
## Calculating cluster 26
## Calculating cluster 27
## Calculating cluster 28
## Calculating cluster 29
## Calculating cluster 30
## Calculating cluster 31
## Calculating cluster 32
#write.csv(cluster.markers, file = "/Users/deniseniell/Desktop/Seurat/run2/clustermarkers_OSr2PCs200res50.csv")
#saveRDS(all.normTREE, file = #"/Users/deniseniell/Desktop/Seurat/run2/OSmarkersTree.rds")

Visualizing top 10 markers

top10 <- cluster.markers %>% group_by(cluster) %>% top_n(n = 5,wt = avg_logFC )
#top10 <- cluster.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
#DoHeatmap(all.normTREE, features = top10$gene) + NoLegend() + theme(axis.text.y = element_text(size = 5))
DotPlot(all.normTREE,features=rev(unique(top10[1:50,]$gene))) + RotatedAxis()+ theme(axis.text.x = element_text(size = 7))

DotPlot(all.normTREE,features=rev(unique(top10[51:100,]$gene))) + RotatedAxis()+ theme(axis.text.x = element_text(size = 7))

DotPlot(all.normTREE,features=rev(unique(top10[101:159,]$gene))) + RotatedAxis()+ theme(axis.text.x = element_text(size = 7))

get markers for each cluster separated out

cluster_markers <- list()

nclust = nlevels(Idents(all.normTREE))
for(i in 1:nclust) {
cluster_markers[[i]] <- cluster.markers[which(cluster.markers$cluster == i),]
   #cluster_markers[[i]] <- FindMarkers(
    # all.normTREE, ident.1 = i, min.pct=0.5, logfc.threshold = 0.5)
}

heatmap for individual clusters

for (i in 1:nclust){
  these <- cluster_markers[[i]]
  these <-these[which(these$avg_logFC>0.5),]
  these <- these[order(these$avg_logFC, decreasing = TRUE),]
  
   print(DoHeatmap(all.normTREE, features = head(these$gene,40)) + NoLegend() + theme(axis.text.y = element_text(size = 6)))

   #print(head(these,40))
}

### get markers for pairwise discrimination

pair.markers <- FindMarkers(all.normTREE, ident.1 = 24, ident.2=25,min.pct = 0.5, logfc.threshold = 0.5)

   DoHeatmap(all.normTREE, features = rownames(pair.markers)) + NoLegend() + theme(axis.text.y = element_text(size = 8))

# get markers of groups

pair.markers <- FindMarkers(all.normTREE, ident.1 = c(15,16,17, 18, 19),min.pct = 0.5, logfc.threshold = 0.5)

   DoHeatmap(all.normTREE, features = rownames(pair.markers)) + NoLegend() + theme(axis.text.y = element_text(size = 8))

#tree maps————————————————————————–

Grab possible nodes using FindMarkers from seurat object, assuming you’ve already run BuildClusterTree()

nodes <- unique(all.normTREE@tools$BuildClusterTree$edge[,1])

tree_markers <- list()
for(i in 1:length(nodes)) {
#for(i in 1:2) {
   tree_markers[[i]] <- FindMarkers(
     all.normTREE, ident.1 = "clustertree", ident.2 = nodes[i], min.pct=0.5)
}

selecting markers that are greater than log2, sorting them and taking 10 most neg (left?) and 10 most pos (right?)

goodmarkers <- list()
leftMarkers <- list()
rightMarkers <- list()
for(i in 1:length(nodes)){
  these <- tree_markers[[i]]
  these <-these[which(abs(these$avg_logFC)>0.5),]
  these <- these[order(these$avg_logFC, decreasing = TRUE),]
  these$node_id <- nodes[[i]]
  these$gene_id<-rownames(these)
  leftMarkers[[i]] <- head(these,3)
  rightMarkers[[i]] <- tail(these,3)
  goodmarkers[[i]] <- these

}

#save out goodmarkers[[i]] and nodes to csv file

#rbind.fill(goodmarkers)
#df_goodtable <- ldply(goodmarkers, data.frame)
#capture.output(summary(df_goodtable), file = "/Users/deniseniell/Desktop/Seurat/run2/R2goodtable")
#write.csv(df_goodtable, file = "/Users/deniseniell/Desktop/Seurat/run2/R2goodmark_list.csv")

concatenate left and right markers for all nodes

allmarkers <- rownames(goodmarkers[[1]])
leftRightMarkers <- c(rownames(leftMarkers[[1]]),rownames(rightMarkers[[1]]))
for(i in 2:length(nodes)){
  allmarkers <- c(allmarkers,rownames(goodmarkers[[i]]))
  leftRightMarkers <- c(leftRightMarkers,rownames(leftMarkers[[i]]),rownames(rightMarkers[[i]]))
  
}

#heatmap for each node individually

for(i in 1:length(tree_markers)){
  i
  these <- tree_markers[[i]]
  these <-these[which(abs(these$avg_logFC)>0.5),]
  these <- these[order(these$avg_logFC, decreasing = TRUE),]
  these <- rbind(head(these,10),tail(these,10))
#print(these)

print(DoHeatmap(all.normTREE, features = rownames(these))+ NoLegend() + theme(axis.text.y = element_text(size = 6)))
  
}

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for the
## RNA assay: No hit-Ocbimv22011989m1, gene:Ocbimv22009322m.g1, EF-hand--EF-
## hand------EF-hand---calmodulin-1-phosphorylase-k-Ocbimv22032213m1, K+-channel-
## tetramerisation-domain-potassium-channel-tetramer-Ocbimv22001228m1, Sox-
## Ocbimv22024292m1, Fork-head-domain-forkhead-box-B1-Ocbimv22038677m1, Zinc-
## finger-C2H2-type--Sp9-transcription-factor-homolog-mous-Ocbimv22028516m1, C2H2-
## Ocbimv22028510m1

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for
## the RNA assay: VACHT-Ocbimv22001681m1, ChAT-Ocbimv22001674m1, AchE?!
## Confirm-Ocbimv22023426m1, gene:Ocbimv22017855m.g1, ATP citrate synthase-
## Ocbimv22001106m1, AMP-binding-enzyme--acyl-CoA-synthetase-short-chain-family-
## m-Ocbimv22017857m1, lactate/malate-dehydrogenase-NAD-binding-domain-lactate/
## mala-Ocbimv22001700m1, Nucleoside H+ symporter/Major Facilititator superfamily-
## Ocbimv22011289m1, Avidin-family-Avidin-family-Ocbimv22036279m1, Cadherin-
## Ocbimv22025965m1

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for the
## RNA assay: gene:Ocbimv22009865m.g1, Unknown function-Ocbimv22030969m1, No hit
## one weak to Voltage dependent calcium channel-Ocbimv22022356m1, Nkx2.1/Hox-
## Ocbimv22033333m1, Scarecrow/Hox-Ocbimv22022773m1

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for the RNA
## assay: Deoxyribonuclease-Ocbimv22015857m1, Lamin intermediate filament protein-
## Ocbimv22000068m1, Nucleoside H+ symporter/Major Facilititator superfamily-
## Ocbimv22011289m1, NAD-dependent-epimerase/dehydratase-family--3-beta-
## hydroxyst-Ocbimv22029814m1, Citron Rho interacting serine/threonine kinase-
## Ocbimv22027688m1, No hit Squid hit-Ocbimv22014258m1, Caprin family member-
## Ocbimv22030377m1, gene:Ocbimv22009865m.g1, Universal stress protein family-
## Ocbimv22023291m1, FMRF related peptide-Ocbimv22023842m1

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for the
## RNA assay: family-with-sequence-similarity-82-member-A2-Ocbimv22010248m1, K+-
## channel-tetramerisation-domain-potassium-channel-tetramer-Ocbimv22000108m1,
## Core-histone-H2A/H2B/H3/H4-TATA-box-binding-protein-associat-Ocbimv22035552m1,
## Lamin intermediate filament protein-Ocbimv22000068m1, Ankyrin-repeat-Ankyrin-
## repeat-----Ankyrin-repeat--Ankyrin-re-Ocbimv22017953m1

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for the RNA
## assay: gene:Ocbimv22033850m.g1

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for the RNA
## assay: Nucleoside H+ symporter/Major Facilititator superfamily-Ocbimv22011289m1,
## Double-stranded-RNA-binding-motif-Double-stranded-RNA-bindin-Ocbimv22005390m1

## Warning in DoHeatmap(all.normTREE, features = rownames(these)): The following
## features were omitted as they were not found in the scale.data slot for the
## RNA assay: Cadherin-like-Cadherin-domain-Cadherin-domain-Cadherin-domai-
## Ocbimv22020261m1

#dotplot for one node individually

i<-14
  i
## [1] 14
  these <- tree_markers[[i]]
  these <-these[which(abs(these$avg_logFC)>0),]
  these <- these[order(these$avg_logFC, decreasing = TRUE),]
  these <- rbind(head(these,10),tail(these,10))

DotPlot(all.normTREE,features=rownames(these))+ RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

#Heatmap of all concatenated node data

DoHeatmap(all.normTREE, features = leftRightMarkers) + NoLegend() + theme(axis.text.y = element_text(size = 4))

map developmental genes

genelist <- vector()
nomatch <- list()
for (i in 1:129){
  gene <- ref[[i,1]]
  gene<-substr(gene,7,str_length(gene)-1)
  loc <- grep(gene,all.genes)
  if (length(loc)>0) {
    genelist <- c(genelist,loc)
  } else {
    nomatch <- c(nomatch,ref[[i,2]])
  }
}
DoHeatmap(all.normTREE, features = all.genes[genelist],disp.min = -1.5, disp.max = 1.5)  + theme(axis.text.y = element_text(size = 4))

map neural genes

genelist <- vector();
nomatch <- list();
for (i in 130:207){
  gene <- ref[[i,1]]
  gene<-substr(gene,7,str_length(gene)-1)
  loc <- grep(gene,all.genes)
  if (length(loc)>0) {
    genelist <- c(genelist,loc)
  } else {
    nomatch <- c(nomatch,ref[[i,2]])
  }
}
DoHeatmap(all.normTREE, features = all.genes[genelist], disp.min = -1.5, disp.max = 1.5)  + theme(axis.text.y = element_text(size = 5))

#map cadherins

DoHeatmap(all.normTREE, features = all.genes[grep("Cadherin-O",all.genes)], disp.min = -1.5, disp.max = 1.5)  + theme(axis.text.y = element_text(size = 8))

#heatmap of yfg (your favorite gene)

yfg <- read.csv("D:/data/octo seq/Genes for in situ.csv",stringsAsFactors=FALSE)

genelist <- vector()
nomatch <- list()
for (i in 1:22){
  gene <- yfg[[i,2]]
  #gene<-substr(gene,7,str_length(gene)-1)
  loc <- grep(gene,all.genes)
  if (length(loc)>0) {
    genelist <- c(genelist,loc)
  } else {
    nomatch <- c(nomatch,yfg[[i,2]])
  }
}
DoHeatmap(all.normTREE, features = all.genes[genelist],disp.min = -1.5, disp.max = 1.5)  + theme(axis.text.y = element_text(size = 8))

DotPlot(all.normTREE,features=rev(all.genes[genelist])) + RotatedAxis()

FeaturePlot(all.normTREE,features = all.genes[genelist], ncol = 1) + NoLegend() + NoAxes()

#heatmap of yfg (your favorite gene)

yfg <- read.csv("D:/data/octo seq/GeneIDs - More Neuro.csv",stringsAsFactors=FALSE)
yfg.unique = unique(yfg[,2])
genelist <- vector()
nomatch <- list()
for (i in 1:length(yfg.unique)){
  gene <- yfg.unique[i]
#gene<-substr(gene,1,str_length(gene)-1)
  loc <- grep(gene,all.genes)
  if (length(loc)>0) {
    genelist <- c(genelist,loc)
  } else {
    nomatch <- c(nomatch,yfg[[i,2]])
  }
}
DotPlot(all.normTREE,features=rev(all.genes[genelist[1:100]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=rev(all.genes[genelist[101:200]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=rev(all.genes[genelist[201:270]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

#heatmap of gpcrs

yfg <- read.csv("D:/data/octo seq/GeneIDs - All GPCRs.csv",stringsAsFactors=FALSE)

genelist <- vector()
nomatch <- list()
for (i in 1:327){
  gene <- yfg[[i,2]]
#gene<-substr(gene,1,str_length(gene)-1)
  loc <- grep(gene,all.genes)
  if (length(loc)>0) {
    genelist <- c(genelist,loc)
  } else {
    nomatch <- c(nomatch,yfg[[i,2]])
  }
}
DotPlot(all.normTREE,features=rev(all.genes[genelist[1:50]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=rev(all.genes[genelist[51:100]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=rev(all.genes[genelist[101:150]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=rev(all.genes[genelist[151:200]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=rev(all.genes[genelist[201:243]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

zinc <- grep("c2h2",all.genes,ignore.case=TRUE)
DotPlot(all.normTREE,features=rev(all.genes[zinc[1:500]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 4))

DotPlot(all.normTREE,features=rev(all.genes[zinc[501:1000]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 4))

DotPlot(all.normTREE,features=rev(all.genes[zinc[1001:1500]])) + RotatedAxis()+ theme(axis.text.x = element_text(size = 4))

#krab c2h2 zinc fingers are noted in albertin et al. probably not getting them all
DotPlot(all.normTREE,features=all.genes[grep("krab",all.genes,ignore.case=TRUE)]) + RotatedAxis()+ theme(axis.text.x = element_text(size = 4))

DotPlot(all.normTREE,features=all.genes[grep("hox",all.genes,ignore.case=TRUE)]) + RotatedAxis()+ theme(axis.text.x = element_text(size = 4))

cadh <-grep("cadherin",all.genes,ignore.case=TRUE)


DotPlot(all.normTREE,features=all.genes[cadh[1:50]]) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=all.genes[cadh[51:100]]) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=all.genes[cadh[101:150]]) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))

DotPlot(all.normTREE,features=all.genes[cadh[151:193]]) + RotatedAxis()+ theme(axis.text.x = element_text(size = 6))
## Warning in FetchData(object = object, vars = features): The following requested
## variables were not found: NA

FeaturePlot(all.normTREE,features = all.genes[grep("protocadherin",all.genes,ignore.case=TRUE)])

print(nomatch)
## [[1]]
## [1] "Ocbimv22001825m"
## 
## [[2]]
## [1] "Ocbimv22002970m"
## 
## [[3]]
## [1] "Ocbimv22002972m"
## 
## [[4]]
## [1] "Ocbimv22002973m"
## 
## [[5]]
## [1] "Ocbimv22004108m"
## 
## [[6]]
## [1] "Ocbimv22006261m"
## 
## [[7]]
## [1] "Ocbimv22006262m"
## 
## [[8]]
## [1] "Ocbimv22006266m"
## 
## [[9]]
## [1] "Ocbimv22006948m"
## 
## [[10]]
## [1] "Ocbimv22009549m"
## 
## [[11]]
## [1] "Ocbimv22009853m"
## 
## [[12]]
## [1] "Ocbimv22009966m"
## 
## [[13]]
## [1] "Ocbimv22010055m"
## 
## [[14]]
## [1] "Ocbimv22011033m"
## 
## [[15]]
## [1] "Ocbimv22011096m"
## 
## [[16]]
## [1] "Ocbimv22011212m"
## 
## [[17]]
## [1] "Ocbimv22011397m"
## 
## [[18]]
## [1] "Ocbimv22011400m"
## 
## [[19]]
## [1] "Ocbimv22011405m"
## 
## [[20]]
## [1] "Ocbimv22011406m"
## 
## [[21]]
## [1] "Ocbimv22011776m"
## 
## [[22]]
## [1] "Ocbimv22012294m"
## 
## [[23]]
## [1] "Ocbimv22012937m"
## 
## [[24]]
## [1] "Ocbimv22013682m"
## 
## [[25]]
## [1] "Ocbimv22013759m"
## 
## [[26]]
## [1] "Ocbimv22013760m"
## 
## [[27]]
## [1] "Ocbimv22016114m"
## 
## [[28]]
## [1] "Ocbimv22016116m"
## 
## [[29]]
## [1] "Ocbimv22016427m"
## 
## [[30]]
## [1] "Ocbimv22019232m"
## 
## [[31]]
## [1] "Ocbimv22019923m"
## 
## [[32]]
## [1] "Ocbimv22020068m"
## 
## [[33]]
## [1] "Ocbimv22020136m"
## 
## [[34]]
## [1] "Ocbimv22020457m"
## 
## [[35]]
## [1] "Ocbimv22021289m"
## 
## [[36]]
## [1] "Ocbimv22021291m"
## 
## [[37]]
## [1] "Ocbimv22021509m"
## 
## [[38]]
## [1] "Ocbimv22022047m"
## 
## [[39]]
## [1] "Ocbimv22022873m"
## 
## [[40]]
## [1] "Ocbimv22024696m"
## 
## [[41]]
## [1] "Ocbimv22024958m"
## 
## [[42]]
## [1] "Ocbimv22025444m"
## 
## [[43]]
## [1] "Ocbimv22025567m"
## 
## [[44]]
## [1] "Ocbimv22025569m"
## 
## [[45]]
## [1] "Ocbimv22026028m"
## 
## [[46]]
## [1] "Ocbimv22026939m"
## 
## [[47]]
## [1] "Ocbimv22027432m"
## 
## [[48]]
## [1] "Ocbimv22027591m"
## 
## [[49]]
## [1] "Ocbimv22027699m"
## 
## [[50]]
## [1] "Ocbimv22029288m"
## 
## [[51]]
## [1] "Ocbimv22029408m"
## 
## [[52]]
## [1] "Ocbimv22029503m"
## 
## [[53]]
## [1] "Ocbimv22030827m"
## 
## [[54]]
## [1] "Ocbimv22030828m"
## 
## [[55]]
## [1] "Ocbimv22030882m"
## 
## [[56]]
## [1] "Ocbimv22031366m"
## 
## [[57]]
## [1] "Ocbimv22031367m"
## 
## [[58]]
## [1] "Ocbimv22032407m"
## 
## [[59]]
## [1] "Ocbimv22032659m"
## 
## [[60]]
## [1] "Ocbimv22033043m"
## 
## [[61]]
## [1] "Ocbimv22033849m"
## 
## [[62]]
## [1] "Ocbimv22034321m"
## 
## [[63]]
## [1] "Ocbimv22034322m"
## 
## [[64]]
## [1] "Ocbimv22034324m"
## 
## [[65]]
## [1] "Ocbimv22034325m"
## 
## [[66]]
## [1] "Ocbimv22034326m"
## 
## [[67]]
## [1] "Ocbimv22034328m"
## 
## [[68]]
## [1] "Ocbimv22034329m"
## 
## [[69]]
## [1] "Ocbimv22034330m"
## 
## [[70]]
## [1] "Ocbimv22034332m"
## 
## [[71]]
## [1] "Ocbimv22034333m"
## 
## [[72]]
## [1] "Ocbimv22034334m"
## 
## [[73]]
## [1] "Ocbimv22034590m"
## 
## [[74]]
## [1] "Ocbimv22036037m"
## 
## [[75]]
## [1] "Ocbimv22036485m"
## 
## [[76]]
## [1] "Ocbimv22036695m"
## 
## [[77]]
## [1] "Ocbimv22036699m"
## 
## [[78]]
## [1] "Ocbimv22037101m"
## 
## [[79]]
## [1] "Ocbimv22037715m"
## 
## [[80]]
## [1] "Ocbimv22038378m"
## 
## [[81]]
## [1] "Ocbimv22038381m"
## 
## [[82]]
## [1] "Ocbimv22038642m"
## 
## [[83]]
## [1] "Ocbimv22039182m"
## 
## [[84]]
## [1] "Ocbimv22039241m"
FeaturePlot(all.normTREE,features = all.genes[genelist[13:22]]) + NoLegend() + NoAxes()

FeaturePlot(all.normTREE,features = all.genes[grep("Synaptotagmin",all.genes)],max.cutoff = 10)

FeaturePlot(all.normTREE,features = all.genes[grep("021175",all.genes)]) + 
scale_color_gradientn( colours = c('lightgrey', 'blue'),  limits = c(0, 8))
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

FeaturePlot(all.normTREE,features = all.genes[grep("VACHT",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("VGlut",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("TH-O",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("TyrBH",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("FMRF amide",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("FMRF related",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("000748",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("25965",all.genes)])

FeaturePlot(all.normTREE,features = all.genes[grep("glutsyn",all.genes,ignore.case=TRUE)])